
#############################################################
### Geographic Representativeness of Legislatures Project ###
###                                                       ###
###           Leonardo Carella, Andrew Eggers             ###
#############################################################

# This is the FIRST script to run to replicate the cross-country analysis. 
# It creates gridded population datasets *by country*, 
# with population estimates for 2005 and mean legislator birthyear
# aggregated at the 15-arcminute side grid size. 

# The code requires two datasets as inputs:
# 1. the raw world population_histsoc_5min_annual_1861-2005.nc dataset (HYDE 3.2)
# 2. the surli_crosscountry_variables.rds file, containing country names and benchmark year

# The code produces two datasets as outputs:
# 1. "grids_2005_15_arcmin.rds", to be used to compute 2005 SURLI
# 2. "grids_complete.rds", to be used to compute MP birthyear benchmark SURLI

#### Packages ####

library(maps)
library(sf)
library(tidyverse)
library(sp)
library(ggmap)
library(haven)
library(stringi)
library(readxl)
library(raster)
library(mapdata)
library(Rmisc)
library(gridExtra)
library(rgdal)
library(geosphere)
library(Matrix)
library(parallel)
library(move)
library(patchwork)
library(ncdf4)

df <- readRDS("surli_crosscountry_variables.rds")
country_vector <- unique(df$country)

#### create 2005 dataset ####

for (w in c(2005)){
  
  # set parameters
  year_parameter <- w
  grid_size_parameter <- 15 #in arcminutes must be a multiple of 5
  countries_parameter <- country_vector
  
  #get the big raster brick, subset by year
  
  popdata <- brick("population_histsoc_5min_annual_1861-2005.nc") 
  subset_grids_original <- subset(popdata, year_parameter-1860)
  #save it as an ascii file, convert to long-format dataset
  writeRaster(subset_grids_original, filename="subset_grids_original.asc", format = "ascii", datatype='INT4S', overwrite=TRUE)
  subset_grids <- as.data.frame(read.asciigrid("subset_grids_original.asc"))
  #associate a country with each (small) grid centroid
  subset_grids$Country <- map.where(database = "world", subset_grids$s1, subset_grids$s2)
  subset_grids <- as.data.frame(separate(subset_grids, Country, into = c("CountryNAME", "Other_Location"), sep = ":", fill = "right", extra = "drop"))
  
  subset_grids$CountryNAME[subset_grids$Other_Location == "Northern Cyprus"] <- NA
  subset_grids$CountryNAME[subset_grids$CountryNAME == "North Macedonia"] <- "Macedonia"
  
  # create a lookup file with country names and factors 
  # (the raster layer doesn't handle strings, or at least 
  # I can't force it to)
  
  countryname <- unique(subset_grids$CountryNAME)
  countryfactor <- unique(as.numeric(as.factor(subset_grids$CountryNAME)))
  lookup <- cbind.data.frame(countryname, countryfactor) %>%
    dplyr::filter(!is.na(countryname))
  
  #create a multi-layer raster with one layer for population 
  # and one layer for country code
  
  xyz <- rasterToPoints(subset_grids_original)
  xyzz <- cbind.data.frame(xyz,country = as.numeric(as.factor(subset_grids$CountryNAME)), stringsAsFactors = F)
  master_raster <- rasterFromXYZ(xyzz)
  
  #save it and stack it
  
  writeRaster(master_raster, filename="master_raster.nc", format="CDF", overwrite=TRUE)   
  master_raster <- stack("master_raster.nc") 
  
  # note: If you don't want to vary the year, 
  # but only country or grid size, you can restart
  # from here, using the same master_raster file 
  
  # function to extract the two layers, modify the population layer so that 
  # it takes value = 0 if it's not in the country you want, aggregate by
  # desired grid size, retain only grids with non-0 population 
  
  get_grids <- function(country, factor) {
    num_code <- lookup$countryfactor[[which(lookup$countryname == country)]]
    master_raster2 <- master_raster
    df <- cbind.data.frame(as.vector(master_raster2[[1,]]), as.vector(master_raster2[[2,]]))
    df[,1][df[,2] != num_code | is.na(df[,2])] <- 0
    #plug the new vector back in
    master_raster2[[1,]] <- df[,1]
    #now aggregate at the grid size you want (1 is 5 arcmins, 2 is 10 arcmins etc.)
    country_raster <- raster::aggregate(master_raster2, fact = factor, fun=sum)
    #extract the population layer
    country_raster <- subset(country_raster, 1)
    writeRaster(country_raster, filename="country_raster.asc", format = "ascii", datatype='INT4S', overwrite=TRUE)
    country_raster <- as.data.frame(read.asciigrid("country_raster.asc")) %>%
      dplyr::filter(country_raster.asc > 0) %>%
      dplyr::rename(pop = country_raster.asc)
    return(country_raster)}
  
  grids_dataframe <- data.frame(pop = NA, s1 = NA, s2 = NA, country = NA)
  system.time(for (i in countries_parameter) {
    out_vec <- get_grids(i, grid_size_parameter/5)
    out_vec$country <- paste(i)
    grids_dataframe <- rbind(grids_dataframe,out_vec) %>%
      dplyr::filter(!is.na(pop))
    print(paste("Completed", i, sep = " "))
  })
  
  #### save resulting dataset as a file in the format grids_year_gridsize.rds, with grid IDs, year and gridsize information ####
  
  grids_dataframe$gID <- 1:length(grids_dataframe$pop)
  grids_dataframe$year <- year_parameter
  grids_dataframe$gridsize <- grid_size_parameter
  grids_dataframe <- grids_dataframe %>%
    dplyr::select(gID, pop, s1, s2, country, gridsize, year)
  
  write_rds(grids_dataframe, paste("grids", year_parameter, grid_size_parameter, "arcmin.rds", sep = "_"))
  
  #### remove files created in the process ####
  
  file.remove("subset_grids_original.asc")
  file.remove("master_raster.nc")
  file.remove("country_raster.asc")}

#### Create birthyear dataset ####

all_years <- sort(unique(df$benchmark_year))

for (w in all_years){
  
  # set parameters
  year_parameter <- w
  grid_size_parameter <- 15 #in arcminutes must be a multiple of 5
  countries_parameter <- df$country[df$benchmark_year == w]
  
  #get the big raster brick, subset by year
  
  popdata <- brick("population_histsoc_5min_annual_1861-2005.nc") 
  subset_grids_original <- subset(popdata, year_parameter-1860)
  #save it as an ascii file, convert to long-format dataset
  writeRaster(subset_grids_original, filename="subset_grids_original.asc", format = "ascii", datatype='INT4S', overwrite=TRUE)
  subset_grids <- as.data.frame(read.asciigrid("subset_grids_original.asc"))
  #associate a country with each (small) grid centroid
  subset_grids$Country <- map.where(database = "world", subset_grids$s1, subset_grids$s2)
  subset_grids <- as.data.frame(separate(subset_grids, Country, into = c("CountryNAME", "Other_Location"), sep = ":", fill = "right", extra = "drop"))
  
  subset_grids$CountryNAME[subset_grids$Other_Location == "Northern Cyprus"] <- NA
  subset_grids$CountryNAME[subset_grids$CountryNAME == "North Macedonia"] <- "Macedonia"
  
  # create a lookup file with country names and factors 
  # (the raster layer doesn't handle strings, or at least 
  # I can't force it to)
  
  countryname <- unique(subset_grids$CountryNAME)
  countryfactor <- unique(as.numeric(as.factor(subset_grids$CountryNAME)))
  lookup <- cbind.data.frame(countryname, countryfactor) %>%
    dplyr::filter(!is.na(countryname))
  
  #create a multi-layer raster with one layer for population 
  # and one layer for country code
  
  xyz <- rasterToPoints(subset_grids_original)
  xyzz <- cbind.data.frame(xyz,country = as.numeric(as.factor(subset_grids$CountryNAME)), stringsAsFactors = F)
  master_raster <- rasterFromXYZ(xyzz)
  
  #save it and stack it
  
  writeRaster(master_raster, filename="master_raster.nc", format="CDF", overwrite=TRUE)   
  master_raster <- stack("master_raster.nc") 
  
  # note: If you don't want to vary the year, 
  # but only country or grid size, you can restart
  # from here, using the same master_raster file 
  
  # function to extract the two layers, modify the population layer so that 
  # it takes value = 0 if it's not in the country you want, aggregate by
  # desired grid size, retain only grids with non-0 population 
  
  get_grids <- function(country, factor) {
    num_code <- lookup$countryfactor[[which(lookup$countryname == country)]]
    master_raster2 <- master_raster
    df <- cbind.data.frame(as.vector(master_raster2[[1,]]), as.vector(master_raster2[[2,]]))
    df[,1][df[,2] != num_code | is.na(df[,2])] <- 0
    #plug the new vector back in
    master_raster2[[1,]] <- df[,1]
    #now aggregate at the grid size you want (1 is 5 arcmins, 2 is 10 arcmins etc.)
    country_raster <- raster::aggregate(master_raster2, fact = factor, fun=sum)
    #extract the population layer
    country_raster <- subset(country_raster, 1)
    writeRaster(country_raster, filename="country_raster.asc", format = "ascii", datatype='INT4S', overwrite=TRUE)
    country_raster <- as.data.frame(read.asciigrid("country_raster.asc")) %>%
      dplyr::filter(country_raster.asc > 0) %>%
      dplyr::rename(pop = country_raster.asc)
    return(country_raster)}
  
  grids_dataframe <- data.frame(pop = NA, s1 = NA, s2 = NA, country = NA)
  system.time(for (i in countries_parameter) {
    out_vec <- get_grids(i, grid_size_parameter/5)
    out_vec$country <- paste(i)
    grids_dataframe <- rbind(grids_dataframe,out_vec) %>%
      dplyr::filter(!is.na(pop))
    print(paste("Completed", i, sep = " "))
  })
  
  #### save resulting dataset as a file in the format grids_year_gridsize.rds, with grid IDs, year and gridsize information ####
  
  grids_dataframe$gID <- 1:length(grids_dataframe$pop)
  grids_dataframe$year <- year_parameter
  grids_dataframe$gridsize <- grid_size_parameter
  grids_dataframe <- grids_dataframe %>%
    dplyr::select(gID, pop, s1, s2, country, gridsize, year)
  
  write_rds(grids_dataframe, paste("grids", year_parameter, grid_size_parameter, "arcmin.rds", sep = "_"))
  
  print(paste("grids", year_parameter, grid_size_parameter, "arcmin.rds", sep = "_"))
  #### remove files created in the process ####
  
  file.remove("subset_grids_original.asc")
  file.remove("master_raster.nc")
  file.remove("country_raster.asc")}


grids_complete <- rbind.data.frame(
readRDS("grids_1951_15_arcmin.rds"),
readRDS("grids_1952_15_arcmin.rds"),
readRDS("grids_1953_15_arcmin.rds"),
readRDS("grids_1954_15_arcmin.rds"),
readRDS("grids_1955_15_arcmin.rds"),
readRDS("grids_1956_15_arcmin.rds"),
readRDS("grids_1957_15_arcmin.rds"),
readRDS("grids_1958_15_arcmin.rds"),
readRDS("grids_1959_15_arcmin.rds"),
readRDS("grids_1960_15_arcmin.rds"),
readRDS("grids_1961_15_arcmin.rds"),
readRDS("grids_1962_15_arcmin.rds"),
readRDS("grids_1963_15_arcmin.rds"),
readRDS("grids_1964_15_arcmin.rds"),
readRDS("grids_1965_15_arcmin.rds"),
readRDS("grids_1966_15_arcmin.rds"),
readRDS("grids_1967_15_arcmin.rds")
)

write_rds(grids_complete, "grids_complete.rds")

